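﻿# Reads every matching CSV in "../data/", drops test entries, replaces malformed or missing prolific IDs
# with numeric placeholders, and saves the resulting dataframe twice: un-hashed as a pickle in "../data/",
# and with SHA-256-hashed prolific IDs as a pickle in the "data_pickles" folder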

import pandas as pd
import numpy as np
import hashlib
import re
from os import listdir
from os.path import isfile, join, splitext, basename
import math


# Directory scanned for input CSV files.
csv_file_path = '../data/'
# Directory receiving the pickle that still contains the un-hashed prolific IDs.
destination_file_path = '../data/'
# Directory receiving the pickle whose prolific IDs have been hashed.
destination_pickled_file_path = '../data_pickles/'

# Keep regular files ending in ".csv" whose names contain neither "prolific"
# nor "comments". os.listdir() returns entries in arbitrary order, so the
# result is sorted to make the processing order reproducible between runs
# and machines.
fileNames = sorted(
    f for f in listdir(csv_file_path)
    if isfile(join(csv_file_path, f))
    and f.endswith('.csv')
    and 'prolific' not in f
    and 'comments' not in f
)

print(fileNames)

# IDs containing the substring "test" mark test entries; their rows are dropped.
_TEST_ID_PATTERN = re.compile(r'test')
# IDs containing '.', '|', '[', a backslash, or a letter in g-z / G-Z are
# treated as malformed and replaced by a placeholder. The previous pattern
# nested its brackets incorrectly, so a backslash was only caught when it was
# followed by ']' and Python emitted a "possible nested set" FutureWarning;
# this single character class covers everything the old pattern matched.
_BAD_ID_PATTERN = re.compile(r'[.|\[\\g-zG-Z]')


def hasher(x):
    """Return the SHA-256 hex digest of the string *x*, or None if *x* is a float."""
    if isinstance(x, float):
        return None
    return hashlib.sha256(x.encode('utf-8')).hexdigest()


# Placeholder IDs are drawn from this counter. It is shared by all files, so
# every replaced ID gets a distinct placeholder across the whole run.
counter = 1000
for myFile in fileNames:
    print("processing: " + myFile)

    # pd.DataFrame.from_csv was removed in pandas 1.0; read_csv with its
    # default index handling corresponds to the old index_col=None call.
    myFrame = pd.read_csv(join(csv_file_path, myFile))
    # Hold the IDs as Python objects so that string placeholders can be
    # written into a column that was parsed as numeric (recent pandas versions
    # warn when a value of an incompatible dtype is assigned).
    myFrame['prolificID'] = myFrame['prolificID'].astype(object)

    # Find prolific IDs that do not conform.
    for pID in myFrame.prolificID.unique():
        if isinstance(pID, str):
            if _TEST_ID_PATTERN.search(pID):
                print("Test *************")
                print(pID)
                # .copy() so the later .loc assignments act on an independent
                # frame rather than on a slice of the one that was read.
                myFrame = myFrame[myFrame.prolificID != pID].copy()
                print('Deleted')
            elif _BAD_ID_PATTERN.search(pID):
                print("Bad Prolific ID ##")
                print(pID)
                # Fix the bad prolific ID (.loc replaces the .ix indexer,
                # which was removed in pandas 1.0).
                myFrame.loc[myFrame.prolificID == pID, 'prolificID'] = str(counter)
                print("Renamed: " + str(counter))
                counter += 1
        else:
            # Non-string value (e.g. NaN from an empty cell).
            print('Failed Parsing: XXXXXXXXXXXXXXXXXXXX')
            print(pID)
            sessions = myFrame[myFrame.prolificID.isnull()].user_key.unique()
            print('unique_session_keys with null prolific ID: ', sessions)
            for session in sessions:
                # Every row of a session that has a null ID receives the same
                # placeholder, keyed on user_key.
                myFrame.loc[myFrame.user_key == session, 'prolificID'] = str(counter)
                print('Renamed: ' + str(counter))
                counter += 1

    myFrame['prolificID'] = myFrame['prolificID'].astype(str)

    # This is the not hashed pickle
    myFrame.to_pickle(join(destination_file_path, splitext(myFile)[0] + ".p"))

    # This is the hashed pickle
    myFrame['prolificID'] = myFrame['prolificID'].apply(hasher)

    myFrame.to_pickle(join(destination_pickled_file_path, splitext(myFile)[0] + "_hashed.p"))
